View Javadoc
1   package edu.jiangxin.apktoolbox.pdf;
2   
import com.itextpdf.kernel.pdf.PdfDocument;
import com.itextpdf.kernel.pdf.PdfReader;
import com.itextpdf.kernel.pdf.PdfWriter;
import org.apache.commons.io.IOUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.apache.pdfbox.Loader;
import org.apache.pdfbox.pdmodel.PDDocument;
import org.apache.pdfbox.pdmodel.PDDocumentCatalog;
import org.apache.pdfbox.pdmodel.PDPage;
import org.apache.pdfbox.pdmodel.PDPageTree;
import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
import org.apache.pdfbox.text.PDFTextStripper;

import java.io.File;
import java.io.IOException;
import java.util.List;
18  
19  public class PdfUtils {
20      private static final Logger LOGGER = LogManager.getLogger(PdfUtils.class.getSimpleName());
21      public static boolean isScannedPdf(File file, int threshold) {
22          int length = 0;
23  
24          try (PDDocument document = Loader.loadPDF(file)) {
25              boolean isEncrypted = document.isEncrypted();
26              if (isEncrypted) {
27                  document.setAllSecurityToBeRemoved(true);
28              }
29  
30              PDFTextStripper stripper = new PDFTextStripper();
31              String text = stripper.getText(document).trim();
32              length = text.length();
33          } catch (IOException e) {
34              LOGGER.error("Error reading PDF file: {}", e.getMessage());
35              return false;
36          }
37          LOGGER.info("Processing file: {}, text size: {}", file.getPath(), length);
38          return length < threshold;
39      }
40  
41      public static boolean isEncryptedPdf(File file) {
42          boolean isEncrypted;
43  
44          try (PDDocument document = Loader.loadPDF(file)) {
45              isEncrypted = document.isEncrypted();
46          } catch (IOException e) {
47              LOGGER.error("Error reading PDF file: {}", e.getMessage());
48              return false;
49          }
50          LOGGER.info("Processing file: {}, is encrypted: {}", file.getPath(), isEncrypted);
51          return isEncrypted;
52      }
53  
54      public static boolean isNonOutlinePdf(File file) {
55          boolean hasOutline = false;
56  
57          try (PDDocument document = Loader.loadPDF(file)) {
58              boolean isEncrypted = document.isEncrypted();
59              if (isEncrypted) {
60                  document.setAllSecurityToBeRemoved(true);
61              }
62  
63              if (document.getDocumentCatalog() != null && document.getDocumentCatalog().getDocumentOutline() != null) {
64                  hasOutline = true;
65              }
66          } catch (IOException e) {
67              LOGGER.error("Error reading PDF file: {}", e.getMessage());
68              return false;
69          }
70          LOGGER.info("Processing file: {}, has outline: {}", file.getPath(), hasOutline);
71          return !hasOutline;
72      }
73  
74      public static boolean hasAnnotations(File file) {
75          boolean hasAnnotations = false;
76  
77          try (PDDocument document = Loader.loadPDF(file)) {
78              boolean isEncrypted = document.isEncrypted();
79              if (isEncrypted) {
80                  document.setAllSecurityToBeRemoved(true);
81              }
82              PDDocumentCatalog catalog = document.getDocumentCatalog();
83              if (catalog == null) {
84                  return false;
85              }
86              PDPageTree pages = document.getDocumentCatalog().getPages();
87              if (pages == null || pages.getCount() == 0) {
88                  return false;
89              }
90  
91              for (PDPage page : pages) {
92                  if (page.getAnnotations() != null && !page.getAnnotations().isEmpty()) {
93                      int pageNumber = page.getCOSObject().getInt("PageNumber", 0);
94                      String subType = page.getAnnotations().get(0).getSubtype();
95                      LOGGER.info("Found annotations on page: {}, subType: {}", pageNumber, subType);
96                      if (!subType.equals("Link")) {
97                          hasAnnotations = true;
98                          break; // No need to check further if we found annotations
99                      }
100                 }
101             }
102         } catch (IOException e) {
103             LOGGER.error("Error reading PDF file: {}", e.getMessage());
104             return hasAnnotations;
105         }
106         LOGGER.info("Processing file: {}, has annotations: {}", file.getPath(), hasAnnotations);
107         return hasAnnotations;
108     }
109 
110     public static void removePassword(File encryptedFile, File targetDir) {
111         try (PDDocument document = Loader.loadPDF(encryptedFile)) {
112             boolean isEncrypted = document.isEncrypted();
113             if (isEncrypted) {
114                 document.setAllSecurityToBeRemoved(true);
115             }
116             String targetFilePath = targetDir.getAbsolutePath() + File.separator + encryptedFile.getName();
117             document.save(targetFilePath);
118             LOGGER.info("Remove password success: {}", targetFilePath);
119         } catch (IOException e) {
120             LOGGER.error("Error processing PDF file: {}", e.getMessage());
121         }
122     }
123 
124     public static void removePasswordWithIText(File encryptedFile, File targetDir) {
125         String targetFilePath = targetDir.getAbsolutePath() + File.separator + encryptedFile.getName();
126         PdfReader reader = null;
127         PdfDocument pdfDoc = null;
128         try {
129             reader = new PdfReader(encryptedFile);
130             reader.setUnethicalReading(true);
131             PdfWriter writer = new PdfWriter(targetFilePath);
132             pdfDoc = new PdfDocument(reader, writer);
133         } catch (IOException e) {
134             LOGGER.error("Error processing PDF file: {}", e.getMessage());
135         } finally {
136             IOUtils.closeQuietly(pdfDoc);
137             IOUtils.closeQuietly(reader);
138         }
139     }
140 
141     public static int getPageCount(File file) {
142         int pageCount = 0;
143 
144         try (PDDocument document = Loader.loadPDF(file)) {
145             boolean isEncrypted = document.isEncrypted();
146             if (isEncrypted) {
147                 document.setAllSecurityToBeRemoved(true);
148             }
149             pageCount = document.getNumberOfPages();
150         } catch (IOException e) {
151             LOGGER.error("Error reading PDF file: {}", e.getMessage());
152             return 0;
153         }
154         LOGGER.info("Processing file: {}, page count: {}", file.getPath(), pageCount);
155         return pageCount;
156     }
157 }